Set the working dir
setwd("/mnt/picea/projects/spruce/u2015029/20170824")
Load libraries
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(gplots))
suppressPackageStartupMessages(library(parallel))
suppressPackageStartupMessages(library(pander))
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(scatterplot3d))
suppressPackageStartupMessages(library(tximport))
suppressPackageStartupMessages(library(vsn))
Source some helper functions
source("~/Git/UPSCb/src/R/plot.multidensity.R")
source("~/Git/UPSCb/src/R/featureSelection.R")
Create palettes
pal <- brewer.pal(8,"Dark2")
pal12 <- brewer.pal(12,"Paired")
hpal <- colorRampPalette(c("blue","white","red"))(100)
Register the default plot margin
mar <- par("mar")
Read the sample information
samples <- read.csv2("~/Git/UPSCb/projects/spruce-cone-development/doc/All_samples.csv")
orig <- list.files("kallisto",
recursive = TRUE,
pattern = "abundance.tsv",
full.names = TRUE)
name them
names(orig) <- sub("_sortmerna.*","",
sapply(strsplit(orig, "/"), .subset, 2))
Reorder the sample data.frame according to the way the results were read in and accounting for tech reps
samples <- samples[match(names(orig),samples$Complete_SciLifeID),]
Read the expression at the transcript level
tx <- suppressMessages(tximport(files = orig,
type = "kallisto",
txOut = TRUE))
kg <- round(tx$counts)
Check how many genes are never expressed
sel <- rowSums(kg) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
round(sum(sel) * 100/ nrow(kg),digits=1),
sum(sel),
nrow(kg))
## [1] "14.3% percent (9543) of 66632 genes are not expressed"
The cumulative gene coverage is as expected
plot(density(log10(rowMeans(kg))),col=pal[1],
main="gene mean raw counts distribution",
xlab="mean raw counts (log10)")
The same is done for the individual samples colored by sex.
plot.multidensity(lapply(1:ncol(kg),function(k){log10(kg)[,k]}),
col=c(1,pal)[as.integer(samples$Sex)],
legend.x="topright",
legend=levels(samples$Sex),
legend.col=c(1,pal)[1:nlevels(samples$Sex)],
legend.lwd=2,
main="sample raw counts distribution",
xlab="per gene raw counts (log10)")
and by batch.
plot.multidensity(lapply(1:ncol(kg),function(k){log10(kg)[,k]}),
col=pal[as.integer(samples$Batch)],
legend.x="topright",
legend=levels(samples$Batch),
legend.col=pal[1:nlevels(samples$Batch)],
legend.lwd=2,
main="sample raw counts distribution",
xlab="per gene raw counts (log10)")
dir.create(file.path("analysis","kallisto"),showWarnings=FALSE)
write.csv(kg,file="analysis/kallisto/raw-unormalised-gene-expression_data.csv")
For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate ### Original data Create the dds object, without giving any prior on the design
dds.kg <- DESeqDataSetFromMatrix(
countData = kg,
colData = samples[,c("Complete_SciLifeID","ID","Biol_repl_ID","Sex","Type","Sampling","Tree")],
design = ~Complete_SciLifeID)
## converting counts to integer mode
Check the size factors (i.e. the sequencing library size effect)
dds.kg <- estimateSizeFactors(dds.kg)
sizes.kg <- sizeFactors(dds.kg)
names(sizes.kg) <- colnames(kg)
pander(sizes.kg)
| 1_150115_BC6L4PANXX_P1387_101 | 1_150115_BC6L4PANXX_P1387_102 |
|---|---|
| 0.9992 | 1.061 |
| 1_150115_BC6L4PANXX_P1387_103 | 1_150115_BC6L4PANXX_P1387_104 |
|---|---|
| 0.9865 | 0.8314 |
| 1_150115_BC6L4PANXX_P1387_105 | 1_150115_BC6L4PANXX_P1387_106 |
|---|---|
| 0.9708 | 1.109 |
| 1_150115_BC6L4PANXX_P1387_107 | 1_150115_BC6L4PANXX_P1387_108 |
|---|---|
| 1.243 | 0.9822 |
| 1_150115_BC6L4PANXX_P1387_109 | 1_150115_BC6L4PANXX_P1387_110 |
|---|---|
| 0.9398 | 0.9635 |
| 1_150115_BC6L4PANXX_P1387_111 | 1_150115_BC6L4PANXX_P1387_112 |
|---|---|
| 0.7944 | 1.002 |
| 1_150115_BC6L4PANXX_P1387_113 | 1_150115_BC6L4PANXX_P1387_114 |
|---|---|
| 0.8913 | 1.064 |
| 1_150115_BC6L4PANXX_P1387_115 | 1_150115_BC6L4PANXX_P1387_116 |
|---|---|
| 1.193 | 1.088 |
| 1_150115_BC6L4PANXX_P1387_117 | 1_150115_BC6L4PANXX_P1387_118 |
|---|---|
| 0.9988 | 1.007 |
| 1_150115_BC6L4PANXX_P1387_119 | 1_150115_BC6L4PANXX_P1387_120 |
|---|---|
| 0.7673 | 0.87 |
| 1_150115_BC6L4PANXX_P1387_121 | 1_150115_BC6L4PANXX_P1387_122 |
|---|---|
| 1.004 | 1.028 |
| 1_150115_BC6L4PANXX_P1387_123 | 1_150115_BC6L4PANXX_P1387_124 |
|---|---|
| 1.316 | 0.8749 |
| 1_150115_BC6L4PANXX_P1387_125 | 2_150115_BC6L4PANXX_P1387_101 |
|---|---|
| 0.9088 | 1.002 |
| 2_150115_BC6L4PANXX_P1387_102 | 2_150115_BC6L4PANXX_P1387_103 |
|---|---|
| 1.069 | 1.004 |
| 2_150115_BC6L4PANXX_P1387_104 | 2_150115_BC6L4PANXX_P1387_105 |
|---|---|
| 0.8406 | 0.9906 |
| 2_150115_BC6L4PANXX_P1387_106 | 2_150115_BC6L4PANXX_P1387_107 |
|---|---|
| 1.115 | 1.241 |
| 2_150115_BC6L4PANXX_P1387_108 | 2_150115_BC6L4PANXX_P1387_109 |
|---|---|
| 0.9853 | 0.9448 |
| 2_150115_BC6L4PANXX_P1387_110 | 2_150115_BC6L4PANXX_P1387_111 |
|---|---|
| 0.9616 | 0.8109 |
| 2_150115_BC6L4PANXX_P1387_112 | 2_150115_BC6L4PANXX_P1387_113 |
|---|---|
| 1.017 | 0.91 |
| 2_150115_BC6L4PANXX_P1387_114 | 2_150115_BC6L4PANXX_P1387_115 |
|---|---|
| 1.071 | 1.195 |
| 2_150115_BC6L4PANXX_P1387_116 | 2_150115_BC6L4PANXX_P1387_117 |
|---|---|
| 1.09 | 1.011 |
| 2_150115_BC6L4PANXX_P1387_118 | 2_150115_BC6L4PANXX_P1387_119 |
|---|---|
| 1.008 | 0.7804 |
| 2_150115_BC6L4PANXX_P1387_120 | 2_150115_BC6L4PANXX_P1387_121 |
|---|---|
| 0.8813 | 1.029 |
| 2_150115_BC6L4PANXX_P1387_122 | 2_150115_BC6L4PANXX_P1387_123 |
|---|---|
| 1.035 | 1.322 |
| 2_150115_BC6L4PANXX_P1387_124 | 2_150115_BC6L4PANXX_P1387_125 | A10_S10_L001 |
|---|---|---|
| 0.8803 | 0.9183 | 0.6457 |
| A10_S2_L001 | A11_S11_L001 | A11_S3_L001 | A12_S12_L001 | A13_S13_L001 |
|---|---|---|---|---|
| 1.035 | 0.6138 | 0.9185 | 1.736 | 2.439 |
| A14_S14_L002 | A14_S4_L001 | A15_S15_L002 | A16_S16_L002 | A16_S5_L001 |
|---|---|---|---|---|
| 0.8509 | 1.172 | 2.676 | 0.75 | 0.9613 |
| A17_S1_L001 | A18_S18_L002 | A19_S19_L002 | A1_S1_L001 | A20_S20_L002 |
|---|---|---|---|---|
| 0.1163 | 2.023 | 2.948 | 1.612 | 1.863 |
| A21_S21_L002 | A22_S22_L002 | A23_S23_L002 | A24_S24_L002 | A24_S6_L001 |
|---|---|---|---|---|
| 2.944 | 0.9153 | 1.584 | 0.8449 | 1.31 |
| A25_S25_L002 | A26_S26_L003 | A28_S27_L003 | A29_S28_L003 | A2_S2_L001 |
|---|---|---|---|---|
| 1.575 | 1.724 | 3.379 | 1.664 | 0.4734 |
| A30_S29_L003 | A34_S30_L003 | A35_S31_L003 | A36_S32_L003 | A37_S33_L003 |
|---|---|---|---|---|
| 1.08 | 1.828 | 1.191 | 1.9 | 2.101 |
| A38_S34_L003 | A3_S3_L001 | A40_S4_L001 | A41_S5_L001 | A43_S6_L001 |
|---|---|---|---|---|
| 1.097 | 2.109 | 1.47 | 0.803 | 0.979 |
| A43_S8_L001 | A4_S4_L001 | A5_S5_L001 | A6_S6_L001 | A7_S7_L001 | A8_S8_L001 |
|---|---|---|---|---|---|
| 1.313 | 1.186 | 2.279 | 1.563 | 2.798 | 2.145 |
| A9_S1_L001 | A9_S9_L001 | B22_S3_L001 | B23_S35_L003 | B24_S36_L003 |
|---|---|---|---|---|
| 1.192 | 0.7917 | 0.8102 | 1.146 | 1.033 |
| B24_S7_L001 | B26_S37_L003 | B27_S49_L004 | B28_S48_L004 | B30_S46_L004 |
|---|---|---|---|---|
| 2.635 | 0.774 | 0.6297 | 0.9617 | 0.399 |
| B32_S45_L004 | B36_S39_L004 | B37_S40_L004 | B38_S41_L004 | B39_S42_L004 |
|---|---|---|---|---|
| 0.8002 | 2.326 | 1.078 | 1.027 | 2.869 |
| B41_S43_L004 | B42_S44_L004 | B51_S47_L004 | B5_S38_L004 | B7_S17_L002 |
|---|---|---|---|---|
| 0.6541 | 0.7117 | 1.299 | 0.8812 | 0.9664 |
| B8_S2_L001 | totRNA-10_S33_L004 | totRNA-14_S31_L004 | totRNA-14_S8_L001 |
|---|---|---|---|
| 0.6251 | 0.71 | 0.5486 | 0.9464 |
| totRNA-15_S29_L004 | totRNA-16_S30_L004 | totRNA-17_S28_L004 |
|---|---|---|
| 0.9678 | 0.6796 | 0.8932 |
| totRNA-18_S32_L004 | totRNA-18_S9_L001 | totRNA-2_S35_L004 |
|---|---|---|
| 0.4464 | 0.7857 | 0.7076 |
| totRNA-33_S39_L004 | totRNA-34_S36_L004 | totRNA-35_S37_L004 |
|---|---|---|
| 0.4607 | 0.808 | 1.667 |
| totRNA-40_S38_L004 | totRNA-47_S27_L004 | totRNA-47_S7_L001 |
|---|---|---|
| 0.452 | 0.5002 | 0.8702 |
| totRNA-48_S10_L001 | totRNA-48_S34_L004 | totRNA-48_S9_L001 |
|---|---|---|
| 0.9603 | 0.6847 | 0.1121 |
| totRNA-49_S11_L001 | totRNA-49_S40_L004 | totRNA-50_S12_L001 |
|---|---|---|
| 1.099 | 0.6193 | 0.9332 |
| totRNA-50_S41_L004 |
|---|
| 0.441 |
boxplot(sizes.kg, main="Sequencing libraries size factor")
At the gene level
vsd.kg <- varianceStabilizingTransformation(dds.kg, blind=TRUE)
vst.kg <- assay(vsd.kg)
vst.kg <- vst.kg - min(vst.kg)
Validate the VST
meanSdPlot(vst.kg[rowSums(kg)>0,])
Export the vst
write.csv(vst.kg,"analysis/kallisto/library-size-normalized_variance-stabilized_gene-expression_data.csv")
".pca" <- function(vst,fact,lgd="topleft",pal=brewer.pal(8,"Dark2")){
pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)
#' ### 3 first dimensions
mar=c(5.1,4.1,4.1,2.1)
scatterplot3d(pc$x[,1],
pc$x[,2],
pc$x[,3],
xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
color=pal[as.integer(fact)],
pch=19)
legend(lgd,pch=19,
col=pal[1:nlevels(fact)],
legend=levels(fact))
par(mar=mar)
}
.pca(vst.kg,factor(samples$Batch),lgd="topright")
.pca(vst.kg,factor(samples$Sex),lgd="topright",pal=c(1,pal))
pc <- prcomp(t(vst.kg))
percent <- round(summary(pc)$importance[2,]*100)
Batch
plot(pc$x[,1],
pc$x[,2],
xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
col=pal[as.integer(factor(samples$Batch))],
pch=19,
main="Principal Component Analysis",sub="variance stabilized counts")
legend("topright",bty="n",col=pal,levels(factor(samples$Batch)),pch=19)
Sex
plot(pc$x[,1],
pc$x[,2],
xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
col=c(1,pal)[as.integer(factor(samples$Sex))],
pch=19,
main="Principal Component Analysis",sub="variance stabilized counts")
legend("topright",bty="n",col=c(1,pal),levels(samples$Sex),pch=19)
Sampling
samples$Date <-
factor(sapply(lapply(strsplit(sub("Oct","10",
sub("Sep","09",
sub("Aug","08",
sub("Jun","06",
sub("May","05",
sub("Apr","04",samples$Sampling)))))),"-"),rev),
paste,collapse="-"))
plot(pc$x[,1],
pc$x[,2],
xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
col=pal12[as.integer(factor(samples$Date))],
pch=19,
main="Principal Component Analysis",sub="variance stabilized counts")
legend("topright",bty="n",col=pal12,levels(samples$Date),pch=19)
Batch
plot(pc$x[,2],
pc$x[,3],
xlab=paste("Comp. 2 (",percent[2],"%)",sep=""),
ylab=paste("Comp. 3 (",percent[3],"%)",sep=""),
col=pal[as.integer(factor(samples$Batch))],
pch=19,
main="Principal Component Analysis",sub="variance stabilized counts")
legend("topright",bty="n",col=pal,levels(factor(samples$Batch)),pch=19)
Sex
plot(pc$x[,2],
pc$x[,3],
xlab=paste("Comp. 2 (",percent[2],"%)",sep=""),
ylab=paste("Comp. 3 (",percent[3],"%)",sep=""),
col=c(1,pal)[as.integer(samples$Sex)],
pch=19,
main="Principal Component Analysis",sub="variance stabilized counts")
legend("topright",bty="n",col=c(1,pal),levels(samples$Sex),pch=19)
Sampling
plot(pc$x[,2],
pc$x[,3],
xlab=paste("Comp. 2 (",percent[2],"%)",sep=""),
ylab=paste("Comp. 3 (",percent[3],"%)",sep=""),
col=pal12[as.integer(factor(samples$Date))],
pch=19,
main="Principal Component Analysis",sub="variance stabilized counts")
legend("topright",bty="n",col=pal12,levels(samples$Date),pch=19)
sel <- featureSelect(vst.kg,samples$Biol_repl_ID,exp = 5,nrep = 2)
heatmap.2(vst.kg[sel,],trace = "none",labRow = FALSE,
col=hpal, ColSideColors = c(1,pal)[as.integer(samples$Sex)],
scale="row")
vst.sat <- vst.kg[sel,]
vst.sat[vst.sat<3] <- 3
vst.sat[vst.sat>8] <- 8
heatmap.2(vst.sat,trace = "none",labRow = FALSE,
col=hpal, ColSideColors = c(1,pal)[as.integer(samples$Sex)])
hc <- hclust(dist(t(vst.sat)))
plot(hc, labels=samples$ID,
main = "Hierarchical clustering",cex=0.7)
counts <- do.call(
cbind,
lapply(split.data.frame(t(kg),
samples$ID),
colSums))
csamples <- samples[,-1]
csamples <- csamples[match(colnames(counts),csamples$ID),]
write.csv(csamples,file="~/Git/UPSCb/projects/spruce-cone-development/doc/biological-samples.csv")
The cumulative gene coverage is as expected
plot(density(log10(rowMeans(counts))),col=pal[1],
main="gene mean raw counts distribution",
xlab="mean raw counts (log10)")
The same is done for the individual samples colored by sex.
plot.multidensity(lapply(1:ncol(counts),function(k){log10(counts)[,k]}),
col=c(1,pal)[as.integer(csamples$Sex)],
legend.x="topright",
legend=levels(csamples$Sex),
legend.col=c(1,pal)[1:nlevels(csamples$Sex)],
legend.lwd=2,
main="sample raw counts distribution",
xlab="per gene raw counts (log10)")
and by batch.
plot.multidensity(lapply(1:ncol(counts),function(k){log10(counts)[,k]}),
col=pal[as.integer(csamples$Batch)],
legend.x="topright",
legend=levels(csamples$Batch),
legend.col=pal[1:nlevels(csamples$Batch)],
legend.lwd=2,
main="sample raw counts distribution",
xlab="per gene raw counts (log10)")
Write the count table (tech. rep. combined)
write.csv(counts,file="analysis/kallisto/raw-unormalised-gene-expression-tech-rep-combined_data.csv")
For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate ### Original data Create the dds object, without giving any prior on the design
dds <- DESeqDataSetFromMatrix(
countData = counts,
colData = csamples[,c("ID","Biol_repl_ID","Sex","Type","Sampling","Date","Batch","Tree")],
design = ~Biol_repl_ID)
## converting counts to integer mode
Check the size factors (i.e. the sequencing library size effect)
dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
names(sizes) <- colnames(counts)
pander(sizes)
| F1-04-10 | F1-08-18 | F1-09-12 | F1-10-08 | F1a-08-01 | F1a-08-19 | F1a-09-16 |
|---|---|---|---|---|---|---|
| 1.21 | 0.6307 | 1.275 | 1.686 | 1.138 | 1.21 | 0.646 |
| F1b-08-01 | F1b-08-19 | F1b-09-16 | F1c-08-01 | F1c-08-19 | F1c-09-16 |
|---|---|---|---|---|---|
| 0.3358 | 0.08201 | 1.12 | 1.492 | 1.429 | 1.525 |
| F1d-08-01 | F2-09-12 | F2-10-08 | F3-08-18 | F3-09-12 | F3-10-08 | F4-09-12 |
|---|---|---|---|---|---|---|
| 1.488 | 1.533 | 1.138 | 1.057 | 1.459 | 1.426 | 1.509 |
| F4-10-08 | F5-09-12 | F5-10-08 | F7-04-29 | F8-04-10 | F8-04-29 | F9-04-10 |
|---|---|---|---|---|---|---|
| 1.425 | 1.417 | 1.435 | 1.238 | 0.283 | 2.603 | 0.446 |
| F9-04-29 | FA1a-08-01 | FA1a-10-18 | FA1b-08-01 | FA1b-10-18 | FA1c-08-01 |
|---|---|---|---|---|---|
| 0.5746 | 1.72 | 2.385 | 1.433 | 1.178 | 1.896 |
| FA1c-10-18 | M1-04-10 | M1-08-18 | M1-09-12 | M1-10-08 | M1a-08-01 | M1b-08-01 |
|---|---|---|---|---|---|---|
| 0.7604 | 0.3266 | 0.9689 | 1.416 | 1.502 | 1.035 | 0.5671 |
| M1c-08-01 | M2-09-12 | M2-10-08 | M3-08-18 | M3-09-12 | M3-10-08 | M4-09-12 |
|---|---|---|---|---|---|---|
| 1.617 | 1.405 | 1.756 | 0.4807 | 1.184 | 1.334 | 1.389 |
| M4-10-08 | M5-09-12 | M5-10-08 | M7-04-29 | M8-04-10 | M8-04-29 | M9-04-10 |
|---|---|---|---|---|---|---|
| 1.387 | 1.572 | 1.361 | 0.5096 | 0.5717 | 0.8169 | 0.6855 |
| M9-04-29 | O7-06-11 | O8-06-11 | O9-06-11 | P7-05-12 | P8-05-12 | P9-05-12 |
|---|---|---|---|---|---|---|
| 0.5533 | 0.5729 | 1.653 | 0.7267 | 0.3276 | 0.4797 | 0.5165 |
| S7-06-11 | S8-06-11 | S9-06-11 | TA1a-10-18 | TA1b-10-18 | TA1c-10-18 |
|---|---|---|---|---|---|
| 1.181 | 0.7675 | 2.03 | 1.189 | 1.077 | 1.219 |
| TL1a -08-01 | TL1b -08-01 | TL1c -08-01 | V1-04-10 | V1-08-18 | V1-10-08 |
|---|---|---|---|---|---|
| 1.982 | 1.516 | 1.405 | 0.9644 | 0.6844 | 1.282 |
| V2-10-08 | V3-08-18 | V3-10-08 | V4-10-08 | V5-10-08 | V7-04-29 | V8-04-10 |
|---|---|---|---|---|---|---|
| 1.239 | 0.8735 | 1.865 | 1.092 | 1.236 | 0.5012 | 0.9124 |
| V8-04-29 | V9-04-10 | V9-04-29 | VL1a-08-01 | VL1a-08-19 | VL1a-09-16 |
|---|---|---|---|---|---|
| 0.4447 | 0.6838 | 0.6241 | 0.842 | 2.084 | 1.114 |
| VL1a-10-25 | VL1b-08-01 | VL1b-08-19 | VL1b-09-16 | VL1b-10-25 | VL1c-08-01 |
|---|---|---|---|---|---|
| 1.289 | 1.614 | 1.327 | 1.221 | 0.8407 | 1.104 |
| VL1c-08-19 | VL1c-10-25 | VL1d-08-01 |
|---|---|---|
| 2.084 | 1.341 | 0.7748 |
boxplot(sizes, main="Sequencing libraries size factor")
pander(sort(sizes))
| F1b-08-19 | F8-04-10 | M1-04-10 | P7-05-12 | F1b-08-01 | V8-04-29 | F9-04-10 |
|---|---|---|---|---|---|---|
| 0.08201 | 0.283 | 0.3266 | 0.3276 | 0.3358 | 0.4447 | 0.446 |
| P8-05-12 | M3-08-18 | V7-04-29 | M7-04-29 | P9-05-12 | M9-04-29 | M1b-08-01 |
|---|---|---|---|---|---|---|
| 0.4797 | 0.4807 | 0.5012 | 0.5096 | 0.5165 | 0.5533 | 0.5671 |
| M8-04-10 | O7-06-11 | F9-04-29 | V9-04-29 | F1-08-18 | F1a-09-16 | V9-04-10 |
|---|---|---|---|---|---|---|
| 0.5717 | 0.5729 | 0.5746 | 0.6241 | 0.6307 | 0.646 | 0.6838 |
| V1-08-18 | M9-04-10 | O9-06-11 | FA1c-10-18 | S8-06-11 | VL1d-08-01 | M8-04-29 |
|---|---|---|---|---|---|---|
| 0.6844 | 0.6855 | 0.7267 | 0.7604 | 0.7675 | 0.7748 | 0.8169 |
| VL1b-10-25 | VL1a-08-01 | V3-08-18 | V8-04-10 | V1-04-10 | M1-08-18 |
|---|---|---|---|---|---|
| 0.8407 | 0.842 | 0.8735 | 0.9124 | 0.9644 | 0.9689 |
| M1a-08-01 | F3-08-18 | TA1b-10-18 | V4-10-08 | VL1c-08-01 | VL1a-09-16 |
|---|---|---|---|---|---|
| 1.035 | 1.057 | 1.077 | 1.092 | 1.104 | 1.114 |
| F1b-09-16 | F2-10-08 | F1a-08-01 | FA1b-10-18 | S7-06-11 | M3-09-12 |
|---|---|---|---|---|---|
| 1.12 | 1.138 | 1.138 | 1.178 | 1.181 | 1.184 |
| TA1a-10-18 | F1a-08-19 | F1-04-10 | TA1c-10-18 | VL1b-09-16 | V5-10-08 |
|---|---|---|---|---|---|
| 1.189 | 1.21 | 1.21 | 1.219 | 1.221 | 1.236 |
| F7-04-29 | V2-10-08 | F1-09-12 | V1-10-08 | VL1a-10-25 | VL1b-08-19 | M3-10-08 |
|---|---|---|---|---|---|---|
| 1.238 | 1.239 | 1.275 | 1.282 | 1.289 | 1.327 | 1.334 |
| VL1c-10-25 | M5-10-08 | M4-10-08 | M4-09-12 | M2-09-12 | TL1c -08-01 |
|---|---|---|---|---|---|
| 1.341 | 1.361 | 1.387 | 1.389 | 1.405 | 1.405 |
| M1-09-12 | F5-09-12 | F4-10-08 | F3-10-08 | F1c-08-19 | FA1b-08-01 | F5-10-08 |
|---|---|---|---|---|---|---|
| 1.416 | 1.417 | 1.425 | 1.426 | 1.429 | 1.433 | 1.435 |
| F3-09-12 | F1d-08-01 | F1c-08-01 | M1-10-08 | F4-09-12 | TL1b -08-01 |
|---|---|---|---|---|---|
| 1.459 | 1.488 | 1.492 | 1.502 | 1.509 | 1.516 |
| F1c-09-16 | F2-09-12 | M5-09-12 | VL1b-08-01 | M1c-08-01 | O8-06-11 | F1-10-08 |
|---|---|---|---|---|---|---|
| 1.525 | 1.533 | 1.572 | 1.614 | 1.617 | 1.653 | 1.686 |
| FA1a-08-01 | M2-10-08 | V3-10-08 | FA1c-08-01 | TL1a -08-01 | S9-06-11 |
|---|---|---|---|---|---|
| 1.72 | 1.756 | 1.865 | 1.896 | 1.982 | 2.03 |
| VL1c-08-19 | VL1a-08-19 | FA1a-10-18 | F8-04-29 |
|---|---|---|---|
| 2.084 | 2.084 | 2.385 | 2.603 |
At the gene level
vsd <- varianceStabilizingTransformation(dds, blind=TRUE)
vst <- assay(vsd)
vst <- vst - min(vst)
Validate the VST
meanSdPlot(vst[rowSums(vst)>0,])
Export the vst
write.csv(vst,"analysis/kallisto/library-size-normalized_variance-stabilized_gene-expression-tech-rep-combined_data.csv")
".pca" <- function(vst,fact,lgd="topleft",pal=brewer.pal(8,"Dark2")){
pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)
#' ### 3 first dimensions
mar=c(5.1,4.1,4.1,2.1)
scatterplot3d(pc$x[,1],
pc$x[,2],
pc$x[,3],
xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
color=pal[as.integer(fact)],
pch=19)
legend(lgd,pch=19,
col=pal[1:nlevels(fact)],
legend=levels(fact))
par(mar=mar)
}
.pca(vst,factor(csamples$Batch),lgd="topright")
.pca(vst,factor(csamples$Sex),lgd="topright",pal=c(1,pal))
pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)
Batch
plot(pc$x[,1],
pc$x[,2],
xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
col=pal[as.integer(factor(csamples$Batch))],
pch=19,
main="Principal Component Analysis",sub="variance stabilized counts")
legend("topright",bty="n",col=pal,levels(factor(csamples$Batch)),pch=19)
Sex
plot(pc$x[,1],
pc$x[,2],
xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
col=c(1,pal)[as.integer(factor(csamples$Sex))],
pch=19,
main="Principal Component Analysis",sub="variance stabilized counts")
legend("topright",bty="n",col=c(1,pal),levels(csamples$Sex),pch=19)
Sampling
plot(pc$x[,1],
pc$x[,2],
xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
col=pal12[as.integer(factor(csamples$Date))],
pch=19,
main="Principal Component Analysis",sub="variance stabilized counts")
legend("topright",bty="n",col=pal12,levels(csamples$Date),pch=19)
Batch
plot(pc$x[,2],
pc$x[,3],
xlab=paste("Comp. 2 (",percent[2],"%)",sep=""),
ylab=paste("Comp. 3 (",percent[3],"%)",sep=""),
col=pal[as.integer(factor(csamples$Batch))],
pch=19,
main="Principal Component Analysis",sub="variance stabilized counts")
legend("topright",bty="n",col=pal,levels(factor(csamples$Batch)),pch=19)
Sex
plot(pc$x[,2],
pc$x[,3],
xlab=paste("Comp. 2 (",percent[2],"%)",sep=""),
ylab=paste("Comp. 3 (",percent[3],"%)",sep=""),
col=c(1,pal)[as.integer(csamples$Sex)],
pch=19,
main="Principal Component Analysis",sub="variance stabilized counts")
legend("topright",bty="n",col=c(1,pal),levels(csamples$Sex),pch=19)
Sampling
plot(pc$x[,2],
pc$x[,3],
xlab=paste("Comp. 2 (",percent[2],"%)",sep=""),
ylab=paste("Comp. 3 (",percent[3],"%)",sep=""),
col=pal12[as.integer(factor(csamples$Date))],
pch=19,
main="Principal Component Analysis",sub="variance stabilized counts")
legend("topright",bty="n",col=pal12,levels(csamples$Date),pch=19)
sel <- featureSelect(vst,csamples$Biol_repl_ID,exp = 5,nrep = 2)
heatmap.2(vst[sel,],trace = "none",labRow = FALSE,
col=hpal, ColSideColors = c(1,pal)[as.integer(csamples$Sex)],
scale="row")
vst.sat <- vst[sel,]
vst.sat[vst.sat<3] <- 3
vst.sat[vst.sat>8] <- 8
heatmap.2(vst.sat,trace = "none",labRow = FALSE,
col=hpal, ColSideColors = c(1,pal)[as.integer(csamples$Sex)])
hc <- hclust(dist(t(vst.sat)))
plot(hc, labels=csamples$ID,
main = "Hierarchical clustering",cex=0.7)
plot(hc, labels=round(sizes[csamples$ID],digits = 2),
main = "Hierarchical clustering",cex=0.7)
The data quality looks overall good. There is some covariates that will need to be taken into account in the analysis, principaly the sampling date. This is confounded with the batch effect. Some direct comparisons will be counfonded with time and it might be difficult to disentangle that effect. We could try to block it on the whole dataset, but we will have an unbalanced design.
The technical replicates…
## R version 3.4.0 (2017-04-21)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.2 LTS
##
## Matrix products: default
## BLAS: /usr/lib/openblas-base/libblas.so.3
## LAPACK: /usr/lib/libopenblasp-r0.2.18.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] hexbin_1.27.1 vsn_3.44.0
## [3] tximport_1.4.0 scatterplot3d_0.3-40
## [5] RColorBrewer_1.1-2 pander_0.6.0
## [7] gplots_3.0.1 DESeq2_1.16.1
## [9] SummarizedExperiment_1.6.3 DelayedArray_0.2.7
## [11] matrixStats_0.52.2 Biobase_2.36.2
## [13] GenomicRanges_1.28.3 GenomeInfoDb_1.12.2
## [15] IRanges_2.10.2 S4Vectors_0.14.3
## [17] BiocGenerics_0.22.0
##
## loaded via a namespace (and not attached):
## [1] bit64_0.9-7 splines_3.4.0
## [3] gtools_3.5.0 Formula_1.2-2
## [5] affy_1.54.0 latticeExtra_0.6-28
## [7] blob_1.1.0 GenomeInfoDbData_0.99.0
## [9] yaml_2.1.14 RSQLite_2.0
## [11] backports_1.1.0 lattice_0.20-35
## [13] limma_3.32.2 digest_0.6.12
## [15] XVector_0.16.0 checkmate_1.8.3
## [17] colorspace_1.3-2 preprocessCore_1.38.1
## [19] htmltools_0.3.6 Matrix_1.2-10
## [21] plyr_1.8.4 XML_3.98-1.9
## [23] genefilter_1.58.1 zlibbioc_1.22.0
## [25] xtable_1.8-2 scales_0.4.1
## [27] gdata_2.18.0 affyio_1.46.0
## [29] BiocParallel_1.10.1 htmlTable_1.9
## [31] tibble_1.3.3 annotate_1.54.0
## [33] ggplot2_2.2.1 nnet_7.3-12
## [35] lazyeval_0.2.0 survival_2.41-3
## [37] magrittr_1.5 memoise_1.1.0
## [39] evaluate_0.10.1 foreign_0.8-69
## [41] BiocInstaller_1.26.0 tools_3.4.0
## [43] data.table_1.10.4 hms_0.3
## [45] stringr_1.2.0 munsell_0.4.3
## [47] locfit_1.5-9.1 cluster_2.0.6
## [49] AnnotationDbi_1.38.1 compiler_3.4.0
## [51] caTools_1.17.1 rlang_0.1.1
## [53] rhdf5_2.20.0 grid_3.4.0
## [55] RCurl_1.95-4.8 htmlwidgets_0.9
## [57] labeling_0.3 bitops_1.0-6
## [59] base64enc_0.1-3 rmarkdown_1.6
## [61] gtable_0.2.0 DBI_0.7
## [63] R6_2.2.2 gridExtra_2.2.1
## [65] knitr_1.16 bit_1.1-12
## [67] Hmisc_4.0-3 rprojroot_1.2
## [69] readr_1.1.1 KernSmooth_2.23-15
## [71] stringi_1.1.5 Rcpp_0.12.11
## [73] geneplotter_1.54.0 rpart_4.1-11
## [75] acepack_1.4.1